# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
Setting up Data
# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
## HH1 HH2 LN WM1 WM2 WM3 WMINT WM4 WM5 WM6D WM6M WM6Y WM8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 3 1 2 3 92 91 92 19 11 2020 2
## 2 1 4 2 1 4 2 92 91 92 18 11 2020 1
## 3 1 9 4 1 9 4 92 91 92 18 11 2020 1
## 4 1 10 4 1 10 4 92 91 92 18 11 2020 2
## 5 1 11 4 1 11 4 92 91 92 19 11 2020 2
## 6 1 11 5 1 11 5 92 91 92 18 11 2020 2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## # WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## # WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## # WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## # WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## # WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## # WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- select(female, WAGEM, HH6, HH7, MSTATUS, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)
# View the first few rows of the new dataframe
summary(female_df)
## WAGEM HH6 HH7 MSTATUS welevel
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.00
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00
## Median :20.00 Median :2.000 Median :3.000 Median :1.000 Median :2.00
## Mean :20.92 Mean :1.683 Mean :3.404 Mean :1.413 Mean :2.46
## 3rd Qu.:23.00 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:1.000 3rd Qu.:3.00
## Max. :47.00 Max. :2.000 Max. :6.000 Max. :9.000 Max. :9.00
## NA's :2026 NA's :11 NA's :11 NA's :11 NA's :524
## insurance ethnicity windex5 CP2
## Min. :1.000 Min. :1.000 Min. :0.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :2.000 Median :1.000
## Mean :1.135 Mean :2.035 Mean :2.494 Mean :1.431
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :9.000 Max. :6.000 Max. :5.000 Max. :9.000
## NA's :524 NA's :864
## HA1 MT4 MT9 MT11
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :2.000 Median :1.000 Median :1.00
## Mean :1.215 Mean :1.664 Mean :1.365 Mean :1.12
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.00
## NA's :524 NA's :524 NA's :2496 NA's :524
# Rename the columns
female_df <- female_df %>%
rename(
age_first_marriage = WAGEM,
area = HH6,
region = HH7,
marital_status = MSTATUS,
education_level = welevel,
health_insurance = insurance,
ethnicity = ethnicity,
wealth_index = windex5,
# contraceptive_decision_maker = CP5, #R
current_contraceptive_use = CP2,
# age_first_sexual_intercourse = SB1, #R
# currently_married = MA1,
# partner_age = MA2, #R
awareness_hiv_aids = HA1,
used_computer_tablet = MT4,
used_internet = MT9,
owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
## age_first_marriage area region
## 2026 11 11
## marital_status education_level health_insurance
## 11 524 524
## ethnicity wealth_index current_contraceptive_use
## 0 0 864
## awareness_hiv_aids used_computer_tablet used_internet
## 524 524 2496
## owns_mobile_phone
## 524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
mutate(
current_contraceptive_use = na_if(current_contraceptive_use, 9),
used_internet = na_if(used_internet, 9),
health_insurance = na_if(health_insurance, 9),
education_level = na_if(education_level, 9),
awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
used_computer_tablet = na_if(used_computer_tablet, 9),
owns_mobile_phone = na_if(owns_mobile_phone, 9),
marital_status = na_if(marital_status, 9)
)
# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
## age_first_marriage area region marital_status
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :20.00 Median :2.000 Median :3.000 Median :1.000
## Mean :20.92 Mean :1.683 Mean :3.404 Mean :1.408
## 3rd Qu.:23.00 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :47.00 Max. :2.000 Max. :6.000 Max. :3.000
## NA's :2026 NA's :11 NA's :11 NA's :18
## education_level health_insurance ethnicity wealth_index
## Min. :0.00 Min. :0.0000 Min. :1.000 Min. :0.000
## 1st Qu.:1.00 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.46 Mean :0.8659 Mean :1.586 Mean :2.494
## 3rd Qu.:3.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :1.0000 Max. :4.000 Max. :5.000
## NA's :525 NA's :525 NA's :1148
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5842 Mean :0.7975 Mean :0.3412
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :885 NA's :541 NA's :532
## used_internet owns_mobile_phone
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000
## Mean :0.6394 Mean :0.8831
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :2501 NA's :529
Distributions of Data
# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "blue", color = "white") +
theme_minimal() +
ggtitle("Histogram of Age at First Marriage") +
xlab("Age at First Marriage") +
ylab("Frequency")
print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Saving the histogram
#ggsave("histogram_age_first_marriage.png", plot = afm_hist, width = 8, height = 6, dpi = 300)
# Plotting a boxplot for 'Age at First Marriage'
ggplot(female_df, aes(y = age_first_marriage)) +
geom_boxplot(fill = "cyan") +
labs(title = "Boxplot of Age at First Marriage", y = "Age at First Marriage") +
theme_minimal()
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Bar plot for 'used_internet'
ggplot(female_df, aes(x = used_internet)) +
geom_bar(fill = "yellow") +
theme_minimal() +
ggtitle("Bar Plot of Used Internet") +
xlab("Used Internet") +
ylab("Count")
## Warning: Removed 2501 rows containing non-finite outside the scale range
## (`stat_count()`).

Handling Missing Data
# Before that, let's double check the count of missing values by column
count_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(count_missing)
## age_first_marriage area region
## 2026 11 11
## marital_status education_level health_insurance
## 18 525 525
## ethnicity wealth_index current_contraceptive_use
## 1148 0 885
## awareness_hiv_aids used_computer_tablet used_internet
## 541 532 2501
## owns_mobile_phone access_to_media
## 529 529
# Make a copy of female_df for imputation
imputed_df <- female_df
# Handle missing values in 'Age at First Marriage'
# Calculate the median value for 'Age at First Marriage', excluding NA values
median_age_first_marriage <- median(imputed_df$age_first_marriage, na.rm = TRUE)
# Impute missing values in 'Age at First Marriage' with the median value
imputed_df$age_first_marriage[is.na(imputed_df$age_first_marriage)] <- median_age_first_marriage
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
# The mode is the value that appears most frequently in the data
uniqv <- unique(na.omit(v)) # Omit NA values and get unique values
uniqv[which.max(tabulate(match(v, uniqv)))] # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).
# Calculate the mode for each binary variable
mode_used_internet <- getMode(imputed_df$used_internet)
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_used_computer_tablet <- getMode(imputed_df$used_computer_tablet)
mode_owns_mobile_phone <- getMode(imputed_df$owns_mobile_phone)
mode_access_to_media <- getMode(imputed_df$access_to_media)
# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
used_internet = ifelse(is.na(used_internet), mode_used_internet, used_internet),
current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
used_computer_tablet = ifelse(is.na(used_computer_tablet), mode_used_computer_tablet, used_computer_tablet),
owns_mobile_phone = ifelse(is.na(owns_mobile_phone), mode_owns_mobile_phone, owns_mobile_phone),
access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)
# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))
# Check the resulting dataset to confirm changes
summary(imputed_df)
## age_first_marriage area region marital_status
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :20.00 Median :2.000 Median :3.000 Median :1.000
## Mean :20.76 Mean :1.683 Mean :3.404 Mean :1.408
## 3rd Qu.:23.00 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :47.00 Max. :2.000 Max. :6.000 Max. :3.000
## NA's :11 NA's :11 NA's :18
## education_level health_insurance ethnicity wealth_index
## Min. :0.000 Min. :0.0000 Min. :1.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.438 Mean :0.8721 Mean :1.527 Mean :2.494
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.000 Max. :1.0000 Max. :4.000 Max. :5.000
##
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.6168 Mean :0.8072 Mean :0.3251
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## used_internet owns_mobile_phone access_to_media
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7192 Mean :0.8886 Mean :0.9071
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
## age_first_marriage area region
## 0 11 11
## marital_status education_level health_insurance
## 18 0 0
## ethnicity wealth_index current_contraceptive_use
## 0 0 0
## awareness_hiv_aids used_computer_tablet used_internet
## 0 0 0
## owns_mobile_phone access_to_media
## 0 0
Visualization Comparing Before vs. After Imputation
# Histogram for 'age_first_marriage' before imputation
afm_0 <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "lightpink", color = "white", bins = 30) +
theme_light() +
ggtitle("Age at First Marriage Before Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Histogram for 'age_first_marriage' after imputation
afm_imputed <- ggplot(imputed_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "plum", color = "white", bins = 30) +
theme_light() +
ggtitle("Age at First Marriage After Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Arrange the two plots side by side
grid.arrange(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).

Creating Binary Variables for Child Marriage Under 18 and 15
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 15
# The new variable "child_marriage_u15" will have a value of 1 if the marriage occurred before age 15, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage_u15 = ifelse(age_first_marriage < 15, 1, 0))
# Move "child_marriage" and "child_marriage_u15" to the front of the dataframe
imputed_df <- imputed_df %>%
select(child_marriage, child_marriage_u15, everything())
Preparing for Regression Analysis
Correlation Matrix
# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")
# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradientn(
colours = c("deepskyblue", "white", "red2"),
values = scales::rescale(c(-1, 0, 1)),
limits = c(-1, 1),
name="Pearson\nCorrelation"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("") +
ylab("") +
ggtitle("Correlation Matrix")
# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)
Accessing Multicollinearity
# Model with all predictors
model_for_vif <- lm(child_marriage ~ area + region + marital_status + education_level + health_insurance + ethnicity + wealth_index + current_contraceptive_use + awareness_hiv_aids + access_to_media, data = imputed_df)
# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
vif_results <- vif(model_for_vif)
print(vif_results)
## area region marital_status
## 1.276710 1.111376 1.556525
## education_level health_insurance ethnicity
## 1.931734 1.040051 1.429007
## wealth_index current_contraceptive_use awareness_hiv_aids
## 1.774619 1.501796 1.556100
## access_to_media
## 1.251097
Converting Categorical and Binary Variables to Factors
# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$marital_status <- as.factor(imputed_df$marital_status)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = TRUE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = TRUE)
# Binary variables are already in the correct format and can be used as is
Baseline Logistic Regression Model
baseline_model <- glm(child_marriage ~ area + region + marital_status + education_level + health_insurance + ethnicity + wealth_index + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)
# Check the model summary to interpret the coefficients
summary(baseline_model)
##
## Call:
## glm(formula = child_marriage ~ area + region + marital_status +
## education_level + health_insurance + ethnicity + wealth_index +
## current_contraceptive_use + awareness_hiv_aids + access_to_media,
## family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.69204 0.19574 -13.753 < 2e-16 ***
## area2 0.23405 0.08675 2.698 0.006972 **
## region2 0.31846 0.13180 2.416 0.015683 *
## region3 0.11824 0.13020 0.908 0.363836
## region4 0.28828 0.12882 2.238 0.025226 *
## region5 -0.01239 0.12758 -0.097 0.922641
## region6 -0.07003 0.13543 -0.517 0.605094
## marital_status2 0.19873 0.12129 1.639 0.101311
## marital_status3 -16.69086 132.38178 -0.126 0.899668
## education_level.L -2.63955 0.24416 -10.811 < 2e-16 ***
## education_level.Q -0.79978 0.15213 -5.257 1.46e-07 ***
## education_level.C 0.73946 0.28177 2.624 0.008681 **
## education_level^4 0.81761 0.29403 2.781 0.005423 **
## education_level^5 0.25180 0.17168 1.467 0.142454
## health_insurance 0.03453 0.08420 0.410 0.681703
## ethnicity2 -0.02966 0.10707 -0.277 0.781759
## ethnicity3 0.27077 0.13057 2.074 0.038108 *
## ethnicity4 1.21560 0.10944 11.107 < 2e-16 ***
## wealth_index.L -0.43723 0.13081 -3.343 0.000830 ***
## wealth_index.Q -0.18217 0.11943 -1.525 0.127181
## wealth_index.C 0.27357 0.10035 2.726 0.006408 **
## wealth_index^4 -0.32427 0.08513 -3.809 0.000139 ***
## wealth_index^5 -0.10307 0.07948 -1.297 0.194712
## current_contraceptive_use 0.09590 0.06954 1.379 0.167884
## awareness_hiv_aids -0.15285 0.07775 -1.966 0.049312 *
## access_to_media -0.06893 0.08757 -0.787 0.431210
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10432.2 on 11275 degrees of freedom
## Residual deviance: 7737.2 on 11250 degrees of freedom
## (18 observations deleted due to missingness)
## AIC: 7789.2
##
## Number of Fisher Scoring iterations: 17